""""
杭州市政策法规-字段：标题、日期、内容
"""


import requests
import re

from lxml import etree

from gzproject.gz_spider.gz_spider import db

# Seconds to wait on each HTTP request. Without an explicit timeout,
# requests.get() can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# Deletion table for the article body: newline, carriage return, tab and
# plain space are dropped in a single pass.
_WHITESPACE_TABLE = str.maketrans('', '', '\n\r\t ')

# Download the listing page and collect every href value found in it.
res = requests.get(
    'http://ghzy.hangzhou.gov.cn/col/col1229368367/index.html?uid=6405289&pageNum=1',
    timeout=REQUEST_TIMEOUT,
)
data = res.content.decode('utf-8')
# The pattern requires a space after the closing quote, so only href
# attributes that are followed by another attribute are captured.
urls = re.findall('href="(.*?)" ', data)

# Only absolute links are visited; relative hrefs are ignored.
url_list = [url for url in urls if url.startswith('http')]

for detail_url in url_list:
    r = requests.get(detail_url, timeout=REQUEST_TIMEOUT)
    data = r.content.decode('utf-8')
    xpath_data = etree.HTML(data)

    # Each xpath() call returns a list of text nodes (possibly empty when
    # the page does not contain the expected element).
    title = xpath_data.xpath('//p[@class="con-title"]//text()')
    date = xpath_data.xpath('//span[@class="date"]/text()')
    contents = xpath_data.xpath('//div[@class="main-txt"]//text()')

    # The previous code called .replace(r'\n\t', ''), which searched for the
    # literal characters backslash-n-backslash-t and therefore never removed
    # real newlines or tabs. translate() strips the actual control characters
    # together with spaces.
    content = ''.join(contents).strip().translate(_WHITESPACE_TABLE)

    # Pass title, date and cleaned body text to the project's db helper.
    # NOTE(review): db_insert's storage semantics are defined elsewhere --
    # confirm it tolerates empty strings for pages that matched no element.
    db.db_insert(''.join(title), ''.join(date), content)
